1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

About Dataset This dataset is a copy of another Kaggle dataset which can be accessed here: https://www.kaggle.com/c/glioma-radiomics The difference is that I have provided ground truth for the test set (test_GT.csv).

The suffix “omics” in Medical Science is associated with analysis of big sets of features (e.g. Genomics, Proteomics). Radiomics are imaging features (e.g., first order and second order) extracted from Regions of Interest (ROI) in radiology images based on predefined functions and filters. Low grade gliomas (LGG) are a type of brain tumors. Astrocytes and Oligodendrocytes which are two types of brain cells, are considered as origins of LGG. Adult LGG are characterized by different mutations which is important to be correctly identified. With this dataset the goal is to determine if an ROI has 1p19q codeletion (Mutacion=1) or not(Mutacion=0). This plays a key role in predicting patient’s response to chemotherapy and their survival. The dataset provides 640 different radiomics features for each ROI. There are 105 ROIs in the training set and 45 ROIs in the test cohort.

1.2 The Data

https://www.kaggle.com/datasets/knamdar/radiomics-for-lgg-dataset?select=test_GT.csv


LGG_Data <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/train.csv")
LGG_DataTest <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/test.csv")
LGG_TestGT <- read.csv("~/GitHub/LatentBiomarkers/Data/LGG/test_GT.csv")
LGG_DataTest$Mutacion <- LGG_TestGT$Mutacion

LGG_Data <- rbind(LGG_Data,LGG_DataTest)

rownames(LGG_Data) <- LGG_Data$patientID
LGG_Data$patientID <- NULL

pander::pander(table(LGG_Data$Mutacion))
0 1
54 96

1.2.0.1 Standarize the names for the reporting

studyName <- "LGG"
dataframe <- LGG_Data
outcome <- "Mutacion"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
150 640
pander::pander(table(dataframe[,outcome]))
0 1
54 96

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9999991

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  wavelet.HHH_glcm_DifferenceEntropy wavelet.HLH_gldm_DependenceNonUniformity original_glszm_GrayLevelNonUniformity wavelet.HLH_glszm_LargeAreaHighGrayLevelEmphasis wavelet.HHH_glszm_GrayLevelNonUniformityNormalized 
#>       original_firstorder_10Percentile       original_firstorder_90Percentile 
#>                             0.20853081                             0.23380727 
#>             original_firstorder_Energy            original_firstorder_Entropy 
#>                             0.36966825                             0.35071090 
#> original_firstorder_InterquartileRange           original_firstorder_Kurtosis 
#>                             0.38862559                             0.03159558 
#> 
#>  Included: 633 , Uni p: 0.0002369668 , Base Size: 5 , Rcrit: 0.281919 
#> 
#> 
 1 <R=1.000,thr=0.950>, Top: 83< 9 >[Fa= 83 ]( 83 , 288 , 0 ),<|><>Tot Used: 371 , Added: 288 , Zero Std: 0 , Max Cor: 1.000
#> 
 2 <R=1.000,thr=0.950>, Top: 51< 1 >[Fa= 133 ]( 51 , 111 , 83 ),<|><>Tot Used: 436 , Added: 111 , Zero Std: 0 , Max Cor: 0.999
#> 
 3 <R=0.999,thr=0.950>, Top: 11< 11 >[Fa= 144 ]( 11 , 23 , 133 ),<|><>Tot Used: 452 , Added: 23 , Zero Std: 0 , Max Cor: 0.967
#> 
 4 <R=0.967,thr=0.950>, Top: 2< 1 >[Fa= 146 ]( 2 , 2 , 144 ),<|><>Tot Used: 452 , Added: 2 , Zero Std: 0 , Max Cor: 0.949
#> 
 5 <R=0.949,thr=0.900>, Top: 62< 1 >[Fa= 167 ]( 61 , 88 , 146 ),<|><>Tot Used: 490 , Added: 88 , Zero Std: 0 , Max Cor: 1.000
#> 
 6 <R=1.000,thr=0.950>, Top: 7< 1 >[Fa= 169 ]( 7 , 7 , 167 ),<|><>Tot Used: 490 , Added: 7 , Zero Std: 0 , Max Cor: 0.944
#> 
 7 <R=0.944,thr=0.900>, Top: 12< 1 >[Fa= 171 ]( 11 , 15 , 169 ),<|><>Tot Used: 496 , Added: 15 , Zero Std: 0 , Max Cor: 1.000
#> 
 8 <R=1.000,thr=0.950>, Top: 1< 1 >[Fa= 171 ]( 1 , 1 , 171 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.935
#> 
 9 <R=0.935,thr=0.900>, Top: 1< 1 >[Fa= 171 ]( 1 , 1 , 171 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 10 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 171 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 11 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 12 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 13 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 14 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 15 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 16 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 17 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 18 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 19 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 20 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.996
#> 
 21 <R=0.996,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.994
#> 
 22 <R=0.994,thr=0.950>, Top: 1< 1 >[Fa= 172 ]( 1 , 1 , 172 ),<|><>Tot Used: 496 , Added: 1 , Zero Std: 0 , Max Cor: 0.899
#> 
 23 <R=0.899,thr=0.800>, Top: 81< 1 >[Fa= 200 ]( 76 , 102 , 172 ),<|><>Tot Used: 531 , Added: 102 , Zero Std: 0 , Max Cor: 0.997
#> 
 24 <R=0.997,thr=0.950>, Top: 9< 1 >[Fa= 206 ]( 9 , 9 , 200 ),<|><>Tot Used: 533 , Added: 9 , Zero Std: 0 , Max Cor: 1.000
#> 
 25 <R=1.000,thr=0.950>, Top: 2< 1 >[Fa= 207 ]( 2 , 2 , 206 ),<|><>Tot Used: 533 , Added: 2 , Zero Std: 0 , Max Cor: 0.980
#> 
 26 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 27 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 28 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 29 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 30 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 31 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 32 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 33 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 34 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 35 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 36 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 37 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 38 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 39 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 40 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 41 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.980
#> 
 42 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.975
#> 
 43 <R=0.975,thr=0.950>, Top: 1< 1 >[Fa= 207 ]( 1 , 1 , 207 ),<|><>Tot Used: 533 , Added: 1 , Zero Std: 0 , Max Cor: 0.949
#> 
 44 <R=0.949,thr=0.900>, Top: 6< 1 >[Fa= 208 ]( 6 , 6 , 207 ),<|><>Tot Used: 533 , Added: 6 , Zero Std: 0 , Max Cor: 0.899
#> 
 45 <R=0.899,thr=0.800>, Top: 25< 1 >[Fa= 217 ]( 22 , 30 , 208 ),<|><>Tot Used: 548 , Added: 30 , Zero Std: 0 , Max Cor: 0.959
#> 
 46 <R=0.959,thr=0.950>, Top: 1< 1 >[Fa= 217 ]( 1 , 1 , 217 ),<|><>Tot Used: 548 , Added: 1 , Zero Std: 0 , Max Cor: 0.893
#> 
 47 <R=0.893,thr=0.800>, Top: 11< 1 >[Fa= 221 ]( 11 , 18 , 217 ),<|><>Tot Used: 556 , Added: 18 , Zero Std: 0 , Max Cor: 0.866
#> 
 48 <R=0.866,thr=0.800>, Top: 4< 1 >[Fa= 221 ]( 4 , 4 , 221 ),<|><>Tot Used: 556 , Added: 4 , Zero Std: 0 , Max Cor: 0.839
#> 
 49 <R=0.839,thr=0.800>, Top: 1< 1 >[Fa= 222 ]( 1 , 1 , 221 ),<|><>Tot Used: 556 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#> 
 50 <R=0.800,thr=0.800>
#> 
 [ 50 ], 0.7995444 Decor Dimension: 556 Nused: 556 . Cor to Base: 312 , ABase: 633 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

6.77e+22

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

6.77e+22

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

0.000131

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

0.000131


varratio <- attr(DEdataframe,"VarRatio")

pander::pander(tail(varratio))
La_wavelet.HHL_glcm_ClusterTendency La_wavelet.HHH_glcm_ClusterTendency La_wavelet.LHH_glcm_Contrast La_wavelet.LHH_firstorder_Maximum La_wavelet.HHH_firstorder_Maximum La_wavelet.HLH_firstorder_Maximum
3.54e-30 2.21e-30 1.58e-30 1.16e-30 9.35e-31 7.84e-31

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPLTM <- attr(DEdataframe,"UPLTM")
  
  gplots::heatmap.2(1.0*(abs(UPLTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
  
  
  
}

1.5.2 Formulas Network

Displaying the features associations

par(op)


#if ((ncol(dataframe) < 1000) && (ncol(dataframe) > 10))
#{

#  DEdataframeB <- ILAA(dataframe,verbose=TRUE,thr=thro,bootstrap=30)

  transform <- attr(DEdataframe,"UPLTM") != 0
  tnames <- colnames(transform)
  colnames(transform) <- str_remove_all(colnames(transform),"La_")
  transform <- abs(transform*cor(dataframe[,rownames(transform)])) # The weights are proportional to the observed correlation
  
  
  VertexSize <- attr(DEdataframe,"fscore") # The size depends on the variable independence relevance (fscore)
  names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
  VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization

  VertexSize <- VertexSize[rownames(transform)]
  rsum <- apply(1*(transform !=0),1,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  csum <- apply(1*(transform !=0),2,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  
  ntop <- min(10,length(rsum))


  topfeatures <- unique(c(names(rsum[order(-rsum)])[1:ntop],names(csum[order(-csum)])[1:ntop]))
  rtrans <- transform[topfeatures,]
  csum <- (apply(1*(rtrans !=0),2,sum) > 1)
  rtrans <- rtrans[,csum]
  topfeatures <- unique(c(topfeatures,colnames(rtrans)))
  print(ncol(transform))
#> [1] 556
  transform <- transform[topfeatures,topfeatures]
  print(ncol(transform))
#> [1] 22
  if (ncol(transform)>100)
  {
    csum <- (apply(1*(transform !=0),2,sum) > 1) & (apply(1*(transform !=0),1,sum) > 1)
    transform <- transform[csum,csum]
    print(ncol(transform))
  }

    if (ncol(transform) < 150)
    {

      gplots::heatmap.2(transform,
                        trace = "none",
                        mar = c(5,5),
                        col=rev(heat.colors(5)),
                        main = "Red Decorrelation matrix",
                        cexRow = cexheat,
                        cexCol = cexheat,
                       srtCol=45,
                       srtRow=45,
                        key.title=NA,
                        key.xlab="|Beta|>0",
                        xlab="Output Feature", ylab="Input Feature")
  
      par(op)
      VertexSize <- VertexSize[colnames(transform)]
      gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
      gr$layout <- layout_with_fr
      
      fc <- cluster_optimal(gr)
      plot(fc, gr,
           edge.width = 2*E(gr)$weight,
           vertex.size=VertexSize,
           edge.arrow.size=0.5,
           edge.arrow.width=0.5,
           vertex.label.cex=(0.15+0.05*VertexSize),
           vertex.label.dist=0.5 + 0.05*VertexSize,
           main="Top Feature Association")
      
    }


par(op)

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after ILAA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.7995444

1.8 U-MAP Visualization of features

1.8.1 The UMAP on Raw Data


  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  topvars <- univariate_BinEnsemble(dataframe,outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),dataframe,family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

original_firstorder_Skewness, original_glcm_ClusterShade, original_gldm_LargeDependenceHighGrayLevelEmphasis, wavelet.HLL_glcm_ClusterShade, original_firstorder_Median and original_glcm_MaximumProbability

#  names(topvars)
#if (nrow(dataframe) < 1000)
#{
  datasetframe.umap = umap(scale(dataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(dataframe[1:numsub,varlist],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])

#}

1.8.2 The decorralted UMAP

  varlistcV <- names(varratio[varratio >= 0.025])
  topvars <- univariate_BinEnsemble(DEdataframe[,varlistcV],outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),DEdataframe,family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

La_original_glcm_Idmn, original_firstorder_Skewness, original_glcm_ClusterShade, original_gldm_LargeDependenceHighGrayLevelEmphasis, La_original_glcm_Contrast and wavelet.HLL_glcm_ClusterShade


  varlistcV <- varlistcV[varlistcV != outcome]
  
#  DEdataframe[,outcome] <- as.numeric(DEdataframe[,outcome])
#if (nrow(dataframe) < 1000)
#{
  datasetframe.umap = umap(scale(DEdataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(DEdataframe[1:numsub,varlistcV],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])

#}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : wavelet.HHH_firstorder_10Percentile 200 : wavelet.HHL_firstorder_Uniformity 300 : wavelet.HLH_glcm_Imc1 400 : wavelet.HLL_gldm_GrayLevelVariance 500 : wavelet.LHH_glrlm_LongRunLowGrayLevelEmphasis
600 : wavelet.LHL_glszm_LargeAreaHighGrayLevelEmphasis




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_wavelet.HHH_firstorder_10Percentile 200 : La_wavelet.HHL_firstorder_Uniformity 300 : La_wavelet.HLH_glcm_Imc1 400 : La_wavelet.HLL_gldm_GrayLevelVariance 500 : La_wavelet.LHH_glrlm_LongRunLowGrayLevelEmphasis
600 : La_wavelet.LHL_glszm_LargeAreaHighGrayLevelEmphasis

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
original_firstorder_Skewness -3.93e-02 5.28e-01 -6.84e-01 6.34e-01 9.78e-01 0.780
original_glcm_ClusterShade 7.42e+02 1.02e+04 -1.06e+04 1.65e+04 4.41e-02 0.755
wavelet.HLL_glcm_ClusterShade -7.99e+02 3.12e+03 2.32e+03 4.29e+03 4.72e-02 0.737
wavelet.HLL_firstorder_Skewness -1.05e-02 5.50e-01 5.21e-01 8.55e-01 5.54e-02 0.736
original_firstorder_Median 2.77e+02 5.52e+01 3.33e+02 8.63e+01 6.29e-01 0.732
original_glcm_MaximumProbability 6.19e-03 2.32e-03 1.02e-02 6.26e-03 7.10e-02 0.722
original_gldm_LargeDependenceHighGrayLevelEmphasis 1.36e+04 9.87e+03 3.19e+04 3.44e+04 2.54e-03 0.711
original_glcm_JointEnergy 1.72e-03 7.60e-04 2.89e-03 2.23e-03 1.21e-02 0.710
original_firstorder_Mean 2.75e+02 5.33e+01 3.20e+02 7.86e+01 3.40e-01 0.702
original_glszm_LargeAreaHighGrayLevelEmphasis 2.34e+04 4.63e+04 1.77e+05 7.31e+05 1.50e-08 0.700


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]


pander::pander(finalTable)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
original_firstorder_Skewness -3.93e-02 5.28e-01 -6.84e-01 6.34e-01 0.9782 0.780
La_original_glcm_Idmn 6.35e-01 1.44e-03 6.33e-01 2.85e-03 0.4863 0.766
original_glcm_ClusterShade 7.42e+02 1.02e+04 -1.06e+04 1.65e+04 0.0441 0.755
La_original_firstorder_Energy -6.46e+07 1.25e+08 6.10e+07 1.57e+08 0.0136 0.742
La_wavelet.HLL_firstorder_InterquartileRange 1.31e+00 1.81e+00 -4.36e-01 2.71e+00 0.2171 0.742
La_wavelet.LHL_glszm_GrayLevelNonUniformity 5.38e+00 4.68e+00 2.30e+00 3.15e+00 0.4485 0.739
wavelet.HLL_glcm_ClusterShade -7.99e+02 3.12e+03 2.32e+03 4.29e+03 0.0472 0.737
La_wavelet.HLL_glszm_GrayLevelNonUniformity 4.79e+00 4.40e+00 1.24e+00 3.81e+00 0.9080 0.733
La_original_glcm_Contrast -1.43e+02 1.58e+01 -1.23e+02 3.50e+01 0.2274 0.730
La_wavelet.HLL_glrlm_RunLengthNonUniformity 4.99e+01 3.00e+01 2.62e+01 3.48e+01 0.0971 0.730

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")


pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.52 499 0.788

theCharformulas <- attr(dc,"LatentCharFormulas")


finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
original_firstorder_Skewness NA -3.93e-02 5.28e-01 -6.84e-01 6.34e-01 9.78e-01 0.780 0.780 0
original_firstorder_Skewness1 NA -3.93e-02 5.28e-01 -6.84e-01 6.34e-01 9.78e-01 0.780 NA NA
La_original_glcm_Idmn + original_glcm_Idmn - (0.379)original_glcm_Idn 6.35e-01 1.44e-03 6.33e-01 2.85e-03 4.86e-01 0.766 0.542 -1
original_glcm_ClusterShade NA 7.42e+02 1.02e+04 -1.06e+04 1.65e+04 4.41e-02 0.755 0.755 0
original_glcm_ClusterShade1 NA 7.42e+02 1.02e+04 -1.06e+04 1.65e+04 4.41e-02 0.755 NA NA
La_original_firstorder_Energy + original_firstorder_Energy - (4.34e+05)wavelet.HLL_glrlm_RunLengthNonUniformity -6.46e+07 1.25e+08 6.10e+07 1.57e+08 1.36e-02 0.742 0.499 -1
La_wavelet.HLL_firstorder_InterquartileRange + wavelet.HLL_firstorder_InterquartileRange - (2.272)wavelet.HLL_firstorder_RobustMeanAbsoluteDeviation 1.31e+00 1.81e+00 -4.36e-01 2.71e+00 2.17e-01 0.742 0.543 -1
La_wavelet.LHL_glszm_GrayLevelNonUniformity - (0.016)wavelet.HLL_glrlm_RunLengthNonUniformity + wavelet.LHL_glszm_GrayLevelNonUniformity 5.38e+00 4.68e+00 2.30e+00 3.15e+00 4.48e-01 0.739 0.607 -1
wavelet.HLL_glcm_ClusterShade NA -7.99e+02 3.12e+03 2.32e+03 4.29e+03 4.72e-02 0.737 0.737 0
wavelet.HLL_glcm_ClusterShade1 NA -7.99e+02 3.12e+03 2.32e+03 4.29e+03 4.72e-02 0.737 NA NA
wavelet.HLL_firstorder_Skewness NA -1.05e-02 5.50e-01 5.21e-01 8.55e-01 5.54e-02 0.736 0.736 NA
La_wavelet.HLL_glszm_GrayLevelNonUniformity - (0.016)wavelet.HLL_glrlm_RunLengthNonUniformity + wavelet.HLL_glszm_GrayLevelNonUniformity 4.79e+00 4.40e+00 1.24e+00 3.81e+00 9.08e-01 0.733 0.613 -1
original_firstorder_Median NA 2.77e+02 5.52e+01 3.33e+02 8.63e+01 6.29e-01 0.732 0.732 NA
La_original_glcm_Contrast + original_glcm_Contrast - (34.029)original_glcm_DifferenceAverage -1.43e+02 1.58e+01 -1.23e+02 3.50e+01 2.27e-01 0.730 0.527 0
La_wavelet.HLL_glrlm_RunLengthNonUniformity - (0.126)original_shape_MeshVolume + (4.206)wavelet.HLL_glrlm_GrayLevelNonUniformity + wavelet.HLL_glrlm_RunLengthNonUniformity 4.99e+01 3.00e+01 2.62e+01 3.48e+01 9.71e-02 0.730 0.565 4
original_glcm_MaximumProbability NA 6.19e-03 2.32e-03 1.02e-02 6.26e-03 7.10e-02 0.722 0.722 NA
original_gldm_LargeDependenceHighGrayLevelEmphasis NA 1.36e+04 9.87e+03 3.19e+04 3.44e+04 2.54e-03 0.711 0.711 0
original_glcm_JointEnergy NA 1.72e-03 7.60e-04 2.89e-03 2.23e-03 1.21e-02 0.710 0.710 NA
original_firstorder_Mean NA 2.75e+02 5.33e+01 3.20e+02 7.86e+01 3.40e-01 0.702 0.702 NA
original_glszm_LargeAreaHighGrayLevelEmphasis NA 2.34e+04 4.63e+04 1.77e+05 7.31e+05 1.50e-08 0.700 0.700 NA

1.10 Comparing ILAA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 46 8
1 9 87
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.887 0.825 0.933
3 se 0.906 0.829 0.956
4 sp 0.852 0.729 0.934
6 diag.or 55.583 20.099 153.711

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe[,c(outcome,varlistcV)],control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 42 12
1 7 89
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.873 0.809 0.922
3 se 0.927 0.856 0.970
4 sp 0.778 0.644 0.880
6 diag.or 44.500 16.342 121.177

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 38 16
1 14 82
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.800 0.727 0.861
3 se 0.854 0.767 0.918
4 sp 0.704 0.564 0.820
6 diag.or 13.911 6.164 31.392


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 27 27
1 3 93
  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.800 0.727 0.861
3 se 0.969 0.911 0.994
4 sp 0.500 0.361 0.639
6 diag.or 31.000 8.728 110.102
  par(op)